1 package org.apache.lucene.analysis.in;
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 import java.util.BitSet;
21 import java.util.IdentityHashMap;
22 import static java.lang.Character.UnicodeBlock.*;
23 import static org.apache.lucene.analysis.util.StemmerUtil.*;
24
25
26
27
28
29
30
31
32 public class IndicNormalizer {
33
34 private static class ScriptData {
35 final int flag;
36 final int base;
37 BitSet decompMask;
38
39 ScriptData(int flag, int base) {
40 this.flag = flag;
41 this.base = base;
42 }
43 }
44
45 private static final IdentityHashMap<Character.UnicodeBlock,ScriptData> scripts =
46 new IdentityHashMap<>(9);
47
48 private static int flag(Character.UnicodeBlock ub) {
49 return scripts.get(ub).flag;
50 }
51
52 static {
53 scripts.put(DEVANAGARI, new ScriptData(1, 0x0900));
54 scripts.put(BENGALI, new ScriptData(2, 0x0980));
55 scripts.put(GURMUKHI, new ScriptData(4, 0x0A00));
56 scripts.put(GUJARATI, new ScriptData(8, 0x0A80));
57 scripts.put(ORIYA, new ScriptData(16, 0x0B00));
58 scripts.put(TAMIL, new ScriptData(32, 0x0B80));
59 scripts.put(TELUGU, new ScriptData(64, 0x0C00));
60 scripts.put(KANNADA, new ScriptData(128, 0x0C80));
61 scripts.put(MALAYALAM, new ScriptData(256, 0x0D00));
62 }
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77 private static final int decompositions[][] = {
78
79 { 0x05, 0x3E, 0x45, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
80
81 { 0x05, 0x3E, 0x46, 0x12, flag(DEVANAGARI) },
82
83 { 0x05, 0x3E, 0x47, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
84
85 { 0x05, 0x3E, 0x48, 0x14, flag(DEVANAGARI) | flag(GUJARATI) },
86
87 { 0x05, 0x3E, -1, 0x06, flag(DEVANAGARI) | flag(BENGALI) | flag(GURMUKHI) | flag(GUJARATI) | flag(ORIYA) },
88
89 { 0x05, 0x45, -1, 0x72, flag(DEVANAGARI) },
90
91 { 0x05, 0x45, -1, 0x0D, flag(GUJARATI) },
92
93 { 0x05, 0x46, -1, 0x04, flag(DEVANAGARI) },
94
95 { 0x05, 0x47, -1, 0x0F, flag(GUJARATI) },
96
97 { 0x05, 0x48, -1, 0x10, flag(GURMUKHI) | flag(GUJARATI) },
98
99 { 0x05, 0x49, -1, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
100
101 { 0x05, 0x4A, -1, 0x12, flag(DEVANAGARI) },
102
103 { 0x05, 0x4B, -1, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
104
105 { 0x05, 0x4C, -1, 0x14, flag(DEVANAGARI) | flag(GURMUKHI) | flag(GUJARATI) },
106
107 { 0x06, 0x45, -1, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
108
109 { 0x06, 0x46, -1, 0x12, flag(DEVANAGARI) },
110
111 { 0x06, 0x47, -1, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
112
113 { 0x06, 0x48, -1, 0x14, flag(DEVANAGARI) | flag(GUJARATI) },
114
115 { 0x07, 0x57, -1, 0x08, flag(MALAYALAM) },
116
117 { 0x09, 0x41, -1, 0x0A, flag(DEVANAGARI) },
118
119 { 0x09, 0x57, -1, 0x0A, flag(TAMIL) | flag(MALAYALAM) },
120
121 { 0x0E, 0x46, -1, 0x10, flag(MALAYALAM) },
122
123 { 0x0F, 0x45, -1, 0x0D, flag(DEVANAGARI) },
124
125 { 0x0F, 0x46, -1, 0x0E, flag(DEVANAGARI) },
126
127 { 0x0F, 0x47, -1, 0x10, flag(DEVANAGARI) },
128
129 { 0x0F, 0x57, -1, 0x10, flag(ORIYA) },
130
131 { 0x12, 0x3E, -1, 0x13, flag(MALAYALAM) },
132
133 { 0x12, 0x4C, -1, 0x14, flag(TELUGU) | flag(KANNADA) },
134
135 { 0x12, 0x55, -1, 0x13, flag(TELUGU) },
136
137 { 0x12, 0x57, -1, 0x14, flag(TAMIL) | flag(MALAYALAM) },
138
139 { 0x13, 0x57, -1, 0x14, flag(ORIYA) },
140
141 { 0x15, 0x3C, -1, 0x58, flag(DEVANAGARI) },
142
143 { 0x16, 0x3C, -1, 0x59, flag(DEVANAGARI) | flag(GURMUKHI) },
144
145 { 0x17, 0x3C, -1, 0x5A, flag(DEVANAGARI) | flag(GURMUKHI) },
146
147 { 0x1C, 0x3C, -1, 0x5B, flag(DEVANAGARI) | flag(GURMUKHI) },
148
149 { 0x21, 0x3C, -1, 0x5C, flag(DEVANAGARI) | flag(BENGALI) | flag(ORIYA) },
150
151 { 0x22, 0x3C, -1, 0x5D, flag(DEVANAGARI) | flag(BENGALI) | flag(ORIYA) },
152
153 { 0x23, 0x4D, 0xFF, 0x7A, flag(MALAYALAM) },
154
155 { 0x24, 0x4D, 0xFF, 0x4E, flag(BENGALI) },
156
157 { 0x28, 0x3C, -1, 0x29, flag(DEVANAGARI) },
158
159 { 0x28, 0x4D, 0xFF, 0x7B, flag(MALAYALAM) },
160
161 { 0x2B, 0x3C, -1, 0x5E, flag(DEVANAGARI) | flag(GURMUKHI) },
162
163 { 0x2F, 0x3C, -1, 0x5F, flag(DEVANAGARI) | flag(BENGALI) },
164
165 { 0x2C, 0x41, 0x41, 0x0B, flag(TELUGU) },
166
167 { 0x30, 0x3C, -1, 0x31, flag(DEVANAGARI) },
168
169 { 0x30, 0x4D, 0xFF, 0x7C, flag(MALAYALAM) },
170
171 { 0x32, 0x4D, 0xFF, 0x7D, flag(MALAYALAM) },
172
173 { 0x33, 0x3C, -1, 0x34, flag(DEVANAGARI) },
174
175 { 0x33, 0x4D, 0xFF, 0x7E, flag(MALAYALAM) },
176
177 { 0x35, 0x41, -1, 0x2E, flag(TELUGU) },
178
179 { 0x3E, 0x45, -1, 0x49, flag(DEVANAGARI) | flag(GUJARATI) },
180
181 { 0x3E, 0x46, -1, 0x4A, flag(DEVANAGARI) },
182
183 { 0x3E, 0x47, -1, 0x4B, flag(DEVANAGARI) | flag(GUJARATI) },
184
185 { 0x3E, 0x48, -1, 0x4C, flag(DEVANAGARI) | flag(GUJARATI) },
186
187 { 0x3F, 0x55, -1, 0x40, flag(KANNADA) },
188
189 { 0x41, 0x41, -1, 0x42, flag(GURMUKHI) },
190
191 { 0x46, 0x3E, -1, 0x4A, flag(TAMIL) | flag(MALAYALAM) },
192
193 { 0x46, 0x42, 0x55, 0x4B, flag(KANNADA) },
194
195 { 0x46, 0x42, -1, 0x4A, flag(KANNADA) },
196
197 { 0x46, 0x46, -1, 0x48, flag(MALAYALAM) },
198
199 { 0x46, 0x55, -1, 0x47, flag(TELUGU) | flag(KANNADA) },
200
201 { 0x46, 0x56, -1, 0x48, flag(TELUGU) | flag(KANNADA) },
202
203 { 0x46, 0x57, -1, 0x4C, flag(TAMIL) | flag(MALAYALAM) },
204
205 { 0x47, 0x3E, -1, 0x4B, flag(BENGALI) | flag(ORIYA) | flag(TAMIL) | flag(MALAYALAM) },
206
207 { 0x47, 0x57, -1, 0x4C, flag(BENGALI) | flag(ORIYA) },
208
209 { 0x4A, 0x55, -1, 0x4B, flag(KANNADA) },
210
211 { 0x72, 0x3F, -1, 0x07, flag(GURMUKHI) },
212
213 { 0x72, 0x40, -1, 0x08, flag(GURMUKHI) },
214
215 { 0x72, 0x47, -1, 0x0F, flag(GURMUKHI) },
216
217 { 0x73, 0x41, -1, 0x09, flag(GURMUKHI) },
218
219 { 0x73, 0x42, -1, 0x0A, flag(GURMUKHI) },
220
221 { 0x73, 0x4B, -1, 0x13, flag(GURMUKHI) },
222 };
223
224 static {
225 for (ScriptData sd : scripts.values()) {
226 sd.decompMask = new BitSet(0x7F);
227 for (int i = 0; i < decompositions.length; i++) {
228 final int ch = decompositions[i][0];
229 final int flags = decompositions[i][4];
230 if ((flags & sd.flag) != 0)
231 sd.decompMask.set(ch);
232 }
233 }
234 }
235
236
237
238
239
240
241
242
243
244 public int normalize(char text[], int len) {
245 for (int i = 0; i < len; i++) {
246 final Character.UnicodeBlock block = Character.UnicodeBlock.of(text[i]);
247 final ScriptData sd = scripts.get(block);
248 if (sd != null) {
249 final int ch = text[i] - sd.base;
250 if (sd.decompMask.get(ch))
251 len = compose(ch, block, sd, text, i, len);
252 }
253 }
254 return len;
255 }
256
257
258
259
260 private int compose(int ch0, Character.UnicodeBlock block0, ScriptData sd,
261 char text[], int pos, int len) {
262 if (pos + 1 >= len)
263 return len;
264
265 final int ch1 = text[pos + 1] - sd.base;
266 final Character.UnicodeBlock block1 = Character.UnicodeBlock.of(text[pos + 1]);
267 if (block1 != block0)
268 return len;
269
270 int ch2 = -1;
271
272 if (pos + 2 < len) {
273 ch2 = text[pos + 2] - sd.base;
274 Character.UnicodeBlock block2 = Character.UnicodeBlock.of(text[pos + 2]);
275 if (text[pos + 2] == '\u200D')
276 ch2 = 0xFF;
277 else if (block2 != block1)
278 ch2 = -1;
279 }
280
281 for (int i = 0; i < decompositions.length; i++)
282 if (decompositions[i][0] == ch0 && (decompositions[i][4] & sd.flag) != 0) {
283 if (decompositions[i][1] == ch1 && (decompositions[i][2] < 0 || decompositions[i][2] == ch2)) {
284 text[pos] = (char) (sd.base + decompositions[i][3]);
285 len = delete(text, pos + 1, len);
286 if (decompositions[i][2] >= 0)
287 len = delete(text, pos + 1, len);
288 return len;
289 }
290 }
291
292 return len;
293 }
294 }